package com.ming.zhihuWebSpider.process;

import com.alibaba.fastjson.JSONObject;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

import java.util.ArrayList;
import java.util.List;

/**
 * WebMagic page processor for Lianjia (fz.lianjia.com) second-hand housing
 * transaction ("chengjiao") pages.
 *
 * <p>Pages are dispatched by URL:
 * <ul>
 *   <li>start page: queues the remaining list pages (page 2..totalPage) and the
 *       detail links found on the page;</li>
 *   <li>list page: queues the detail links found on the page;</li>
 *   <li>detail page: extracts the title text and prints it to standard output.</li>
 * </ul>
 *
 * @author xxd
 * @date 2020/05/07
 */
@Component
public class LianJiaProcessor implements PageProcessor {

	/** Seed URL handed to the spider by {@link #crawl()}. */
	private static final String SEED_URL = "https://fz.lianjia.com/chengjiao/";

	/** Start page (first list page). Dots are escaped so they only match a literal '.'. */
	private static final String START_URL = "https://fz\\.lianjia\\.com/chengjiao/$";

	/** Paginated list pages, e.g. {@code .../chengjiao/pg2/}. */
	private static final String URL_LIST = "https://fz\\.lianjia\\.com/chengjiao/pg\\w+/$";

	/** Detail pages, e.g. {@code .../chengjiao/<id>.html}. */
	private static final String URL_DETAIL = "https://fz\\.lianjia\\.com/chengjiao/\\w+\\.html$";

	private final Site site = Site.me().setCycleRetryTimes(5).setRetryTimes(5).setSleepTime(300).setTimeOut(3 * 60 * 1000)
			.setUserAgent(
					"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.65 Safari/537.31");

	/**
	 * Handles one downloaded page according to which URL pattern it matches.
	 * Pages matching none of the patterns are ignored.
	 *
	 * @param page the downloaded page supplied by WebMagic
	 */
	@Override
	public void process(Page page) {
		if (page.getUrl().regex(START_URL).match()) {
			// Start page: queue the other list pages first, then this page's detail links.
			page.addTargetRequests(paginationUrls(page));
			page.addTargetRequests(page.getHtml().links().regex(URL_DETAIL).all());
		} else if (page.getUrl().regex(URL_LIST).match()) {
			// List page: queue the detail links it contains.
			page.addTargetRequests(page.getHtml().links().regex(URL_DETAIL).all());
		} else if (page.getUrl().regex(URL_DETAIL).match()) {
			// Detail page: extract and print the title.
			String title = page.getHtml().xpath("/html/body/div[4]/div/text()").get();
			System.out.println(title);
		}
	}

	/**
	 * Builds the URLs of list pages 2..totalPage from the {@code page-data}
	 * attribute of the pagination element on the start page.
	 *
	 * <p>The attribute is parsed as a JSON object and its {@code totalPage}
	 * entry is read. When the element, the attribute, or the entry is missing,
	 * an empty list is returned so the caller can still queue the detail links
	 * instead of failing with a {@link NullPointerException}.
	 *
	 * @param page the start page
	 * @return list-page URLs for pages 2 and up; empty if no pagination data is available
	 */
	private List<String> paginationUrls(Page page) {
		List<String> urls = new ArrayList<>();

		String pageData = page.getHtml().css(".page-box .house-lst-page-box", "page-data").get();
		if (pageData == null) {
			return urls;
		}
		JSONObject pagination = JSONObject.parseObject(pageData);
		if (pagination == null) {
			return urls;
		}
		// getInteger converts numeric strings/longs instead of failing on a hard cast.
		Integer totalPage = pagination.getInteger("totalPage");
		if (totalPage == null) {
			return urls;
		}

		String baseUrl = page.getUrl().get();
		// Page 1 is the start page itself, so numbering begins at 2.
		for (int pageNo = 2; pageNo <= totalPage; pageNo++) {
			urls.add(baseUrl + "pg" + pageNo + "/");
		}
		return urls;
	}

	/**
	 * @return the site configuration (retry counts, sleep time, timeout, user agent) used by this processor
	 */
	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Runs a single-threaded spider over the seed URL using this processor
	 * instance; blocks until the spider finishes.
	 */
	public void crawl() {
		Spider.create(this).addUrl(SEED_URL).thread(1).run();
	}

	/**
	 * Stand-alone entry point: starts the crawl without a Spring context.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		new LianJiaProcessor().crawl();
	}

}
